This document aims at exploring two datasets, one in 2016 on 6 individuals and another one in 2018 on 4 individuals. For that purpose, we need first to load the weanlingNES package to load data.
# load the weanlingNES package, which ships the elephant-seal datasets
library(weanlingNES)
# attach the `data_nes` list (year_2016 and year_2018 elements) to the session
data("data_nes", package = "weanlingNES")
Let’s have a look at what’s inside data_nes$year_2016:
# Top-level structure of the 2016 data: one data.table per seal.
# (spelled out TRUE/FALSE instead of the reassignable T/F shorthand)
str(data_nes$year_2016, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_3449:Classes 'data.table' and 'data.frame': 384 obs. of 23 variables:
## $ ind_3450:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3456:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3457:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3460:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3463:Classes 'data.table' and 'data.frame': 213 obs. of 23 variables:
A list of 6
data.frames, one for each seal
For convenience, we aggregate all 6 individuals into one dataset.
# Combine all 6 individuals into one data.table, keeping each list name in
# an `.id` column. Fixed misspelled argument `use.name` -> `use.names`
# (the original only worked through fragile partial argument matching).
data_2016 <- rbindlist(data_nes$year_2016, use.names = TRUE, idcol = TRUE)
# display a random sample of 100 rows in an interactive table
DT::datatable(data_2016[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-individual summary of the raw 2016 data: number of recorded days,
# deepest dive, and SST mean/standard deviation.
data_2016[, .(
  nb_days_recorded = uniqueN(as.Date(date)),
  max_depth = max(maxpress_dbars),
  sst_mean = mean(sst2_c),
  sst_sd = sd(sst2_c)
), by = .id] %>%
  sable(
    caption = "Summary diving information relative to each 2016 individual",
    digits = 2
  )
| .id | nb_days_recorded | max_depth | sst_mean | sst_sd |
|---|---|---|---|---|
| ind_3449 | 384 | 1118.81 | 26.54 | 129.91 |
| ind_3450 | 253 | 954.81 | 130.29 | 367.69 |
| ind_3456 | 253 | 697.63 | 125.24 | 360.39 |
| ind_3457 | 253 | 572.94 | 135.56 | 374.71 |
| ind_3460 | 253 | 832.25 | 65.24 | 249.12 |
| ind_3463 | 213 | 648.81 | 212.19 | 462.88 |
Well, it seems that sst is a bit odd. Let’s have a look at its distribution.
# Histogram of the raw SST values, one panel per seal, to spot outliers.
ggplot(data_2016, aes(x = sst2_c, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.1: Distribution of raw sst2 for the six individuals in 2016
Let’s remove any data with a sst2_c > 500.
# Drop the implausible SST readings (>= 500) and re-plot the distribution.
data_2016_filter <- data_2016[sst2_c < 500, ]
ggplot(data_2016_filter, aes(x = sst2_c, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.2: Distribution of filtered sst2 for the six individuals in 2016
Well, that seems to be much better! In the process of filtering out odd values, we removed 116 rows this way:
# Count, per individual, how many rows the SST filter removed.
data_2016[sst2_c > 500, .(nb_row_removed = .N), by = .id] %>%
  sable(caption = "# of rows removed by 2016-individuals")
| .id | nb_row_removed |
|---|---|
| ind_3449 | 4 |
| ind_3450 | 23 |
| ind_3456 | 22 |
| ind_3457 | 24 |
| ind_3460 | 10 |
| ind_3463 | 33 |
# Dive-depth time series per seal; black points mark the rows flagged as
# SST outliers, to see where/when they happened along each track.
ggplot(data_2016,
       aes(y = -maxpress_dbars, x = as.Date(date), col = .id)) +
  geom_path(show.legend = FALSE) +
  geom_point(data = data_2016[sst2_c > 500, ], col = "black") +
  scale_x_date(date_labels = "%m/%Y") +
  labs(y = "Pressure (dbar)", x = "Date") +
  facet_wrap(.id ~ .) +
  theme_jjo() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Figure 1.3: Where and when the sst2 outliers occurred
Well this latter plot highlights several points:
ind_3449 seems to have returned ashore twice during his track. Let’s see if we can double-check that, using a map.
# Interactive map of ind_3449's track; red circles are the locations of the
# rows removed by the SST filter.
track_3449 <- data_2016[.id == "ind_3449"]
leaflet() %>%
  setView(lng = -122, lat = 38, zoom = 2) %>%
  addTiles() %>%
  addPolylines(lat = track_3449[, latitude_degs],
               lng = track_3449[, longitude_degs],
               weight = 2) %>%
  addCircleMarkers(lat = track_3449[sst2_c > 500, latitude_degs],
                   lng = track_3449[sst2_c > 500, longitude_degs],
                   radius = 3,
                   stroke = FALSE,
                   color = "red",
                   fillOpacity = 1)
Figure 1.4: It is supposed to be the track of ind_3449… (red dots are the location of removed rows)
… these coordinates seem weird !
# Per-individual summary (median and IQR) of the raw coordinates.
data_2016[, .(.id, longitude_degs, latitude_degs)] %>%
  tbl_summary(by = .id) %>%
  modify_caption("Summary of `longitude_degree` and `latitude_degree`")
| Characteristic | ind_3449, N = 384¹ | ind_3450, N = 253¹ | ind_3456, N = 253¹ | ind_3457, N = 253¹ | ind_3460, N = 253¹ | ind_3463, N = 213¹ |
|---|---|---|---|---|---|---|
| longitude_degs | -119 (-132, -69) | -124 (-144, -99) | -112 (-134, -6) | -122 (-132, -97) | -122 (-144, -85) | -121 (-134, -93) |
| latitude_degs | 39 (-67, 68) | 60 (-63, 68) | 56 (-63, 68) | 44 (-63, 68) | 59 (-63, 68) | 63 (42, 72) |
¹ Median (IQR)
# Reshape the coordinates to long format so longitude and latitude share a
# single faceted figure, then plot their distributions per seal.
coord_long <- melt(
  data_2016[, .(Longitude = longitude_degs,
                Latitude = latitude_degs,
                .id)],
  id.vars = ".id",
  value.name = "Coordinate"
)
ggplot(coord_long, aes(x = Coordinate, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_grid(variable ~ .id) +
  theme_jjo()
Figure 1.5: Distribution of coordinates per seal
There is definitely something wrong with these coordinates (five seals would have crossed the equator…), but the representation of the track can also be improved! Here are some points to explore:
longitude: part of the data seems to have a wrong sign, resulting in distributions that appear to be cut off. latitude: this is less certain, but maybe the same problem occurs. Let’s try to play on the coordinates’ sign to see if we can display something that makes more sense.
# Re-draw ind_3449's track after forcing latitudes positive and longitudes
# negative (i.e. assuming a north-western hemisphere track).
leaflet() %>%
  setView(lng = -122, lat = 50, zoom = 3) %>%
  addTiles() %>%
  addPolylines(lat = data_2016[.id == "ind_3449", abs(latitude_degs)],
               lng = data_2016[.id == "ind_3449", -abs(longitude_degs)],
               weight = 2)
Figure 1.6: An attempt to display the ind_3449’s track
I’ll better ask Roxanne!
# Missing-value check: one logical column per attribute (TRUE = NA),
# reshaped to long format; identifier/time columns are excluded.
dataPlot <- melt(data_2016_filter[, .(.id, is.na(.SD)),
                                  .SDcol = -c(".id", "rec#", "date", "time")])
# row index within each (variable, .id) pair, used as the y position
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# tile plot: black tiles mark missing values
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 1.7: Check for missing value in 2016-individuals
Let’s have a look at what’s inside data_nes$year_2018:
# Top-level structure of the 2018 data: one data.table per seal.
# (spelled out TRUE/FALSE instead of the reassignable T/F shorthand)
str(data_nes$year_2018, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_2018070:Classes 'data.table' and 'data.frame': 22393 obs. of 42 variables:
## $ ind_2018072:Classes 'data.table' and 'data.frame': 29921 obs. of 42 variables:
## $ ind_2018074:Classes 'data.table' and 'data.frame': 38608 obs. of 42 variables:
## $ ind_2018080:Classes 'data.table' and 'data.frame': 19028 obs. of 42 variables:
A list of 4
data.frames, one for each seal
For convenience, we aggregate all 4 individuals into one dataset.
# Combine all 4 individuals into one data.table, keeping each list name in
# an `.id` column. Fixed misspelled argument `use.name` -> `use.names`
# (the original only worked through fragile partial argument matching).
data_2018 <- rbindlist(data_nes$year_2018, use.names = TRUE, idcol = TRUE)
# display a random sample of 100 rows in an interactive table
DT::datatable(data_2018[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-individual dive summary for 2018: recording span, dive count, and
# mean depth, duration, bottom time and post-dive interval.
data_2018[, .(
  nb_days_recorded = uniqueN(as.Date(date)),
  nb_dives = .N,
  maxdepth_mean = mean(maxdepth),
  dduration_mean = mean(dduration),
  botttime_mean = mean(botttime),
  pdi_mean = mean(pdi, na.rm = TRUE)
), by = .id] %>%
  sable(
    caption = "Summary diving information relative to each 2018 individual",
    digits = 2
  )
| .id | nb_days_recorded | nb_dives | maxdepth_mean | dduration_mean | botttime_mean | pdi_mean |
|---|---|---|---|---|---|---|
| ind_2018070 | 232 | 22393 | 305.52 | 783.27 | 243.22 | 109.55 |
| ind_2018072 | 342 | 29921 | 357.86 | 876.96 | 278.02 | 104.90 |
| ind_2018074 | 371 | 38608 | 250.67 | 686.25 | 291.89 | 302.77 |
| ind_2018080 | 215 | 19028 | 296.50 | 867.69 | 339.90 | 103.51 |
Very nice dataset :)
# Missing-value check for the 2018 data, excluding identifier and raw
# time columns; long format with one logical value per (row, attribute).
dataPlot <- melt(data_2018[, .(.id, is.na(.SD)),
                           .SDcol = -c(".id",
                                       "divenumber",
                                       "year",
                                       "month",
                                       "day",
                                       "hour",
                                       "min",
                                       "sec",
                                       "juldate",
                                       "divetype",
                                       "date")])
# row index within each (variable, .id) pair
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# tile plot: black tiles mark missing values
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 2.1: Check for missing value in 2018-individuals
So far so good, only a few variables seem to have missing values:
# Percentage of missing values per column (identifier/time columns excluded).
# Idiom fixes: `sum(is.na(x))` instead of `length(x[is.na(x)])`, and
# type-stable `vapply()` instead of `sapply()`.
table_inter <- data_2018[, lapply(.SD, function(x) {
  round(sum(is.na(x)) * 100 / length(x), 1)  # % of NA, one decimal
}), .SDcol = -c(
  ".id",
  "divenumber",
  "year",
  "month",
  "day",
  "hour",
  "min",
  "sec",
  "juldate",
  "divetype",
  "date"
)]
# flag the columns with no missing value at all
cond_inter <- vapply(table_inter, function(x) x == 0, logical(1))
# drop the all-complete columns, then display the remaining percentages
table_inter[, which(cond_inter) := NULL] %>%
  sable(caption = "Percentage of missing values per columns having missing values!") %>%
  scroll_box(width = "100%")
| lightatsurf | lattenuation | euphoticdepth | thermoclinedepth | driftrate | benthicdivevertrate | cornerindex | foragingindex | verticalspeed90perc | verticalspeed95perc |
|---|---|---|---|---|---|---|---|---|---|
| 26.3 | 89 | 62.6 | 1.3 | 0.5 | 22.7 | 75.8 | 0.5 | 0.1 | 0.1 |
Ok, let’s have a look at all the data. But first, we have to remove outliers. Some of them are quite easy to spot looking at the distribution of dive duration:
# Distribution of dive duration BEFORE filtering; the dashed vertical line
# shows the subjective 3000 s outlier threshold. `[, .SD]` takes a copy so
# the added `state` column does not touch data_2018.
ggplot(data_2018[, .SD][, state := "Before"],
       aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000, linetype = "longdash") +
  facet_grid(state ~ .id,
             scales = "free") +
  labs(y = "# of dives", x = "Dive duration (s)") +
  theme_jjo()
Figure 2.2: Distribution of dduration for each seal. The dashed line highlight the “subjective” threshold used to remove outliers (3000 sec)
# Distribution of dive duration AFTER removing dduration >= 3000 s.
# Fixed: the x/y axis labels were swapped (x is the dive duration here,
# exactly as in the "Before" plot above).
ggplot(data_2018[dduration < 3000, ][, state := "After"],
       aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000, linetype = "longdash") +
  facet_grid(state ~ .id,
             scales = "free") +
  labs(x = "Dive duration (s)", y = "# of dives") +
  theme_jjo()
Figure 2.3: Same distribution of dduration for each seal but after removing any dduration > 3000 sec. The dashed line highlight the “subjective” threshold used to remove outliers
It seems much better, so let’s remove any rows with dduration > 3000 sec.
# Keep only dives shorter than 3000 s ...
data_2018_filter <- data_2018[dduration < 3000, ]
# ... and report how many rows were dropped, per individual.
data_2018[dduration >= 3000, .(nb_row_removed = .N), by = .id] %>%
  sable(caption = "# of rows removed by 2018-individuals")
| .id | nb_row_removed |
|---|---|
| ind_2018070 | 3 |
| ind_2018072 | 1 |
| ind_2018074 | 33 |
# Variables to plot in the per-variable sections below: everything except
# identifiers, raw time fields, day_departure, and the two water-column
# depths that get their own overlay on the maxdepth panel.
names_display <- names(data_2018_filter[, -c(
  ".id", "date", "divenumber", "year", "month",
  "day", "hour", "min", "sec", "juldate", "divetype",
  "euphoticdepth", "thermoclinedepth", "day_departure"
)])
# For each displayed variable: emit a markdown sub-section header and a
# time-series scatter plot (one panel per seal). The `maxdepth` section
# additionally overlays the thermocline and euphotic depths with their own
# colour scale (via ggnewscale::new_scale_color()).
for (i in names_display) {
# unnumbered ({-}) markdown header for the current variable
cat('####', i, '{-} \n')
if (i == "maxdepth") {
print(
ggplot() +
# thermocline depth, plotted negative so depth increases downwards
geom_point(
data = data_2018_filter[, .(.id,
date,
thermoclinedepth)],
aes(
x = as.Date(date),
y = -thermoclinedepth,
colour = "Thermocline (m)"
),
alpha = .2,
size = .5
) +
# euphotic-zone depth overlay
geom_point(
data = data_2018_filter[, .(.id,
date,
euphoticdepth)],
aes(
x = as.Date(date),
y = -euphoticdepth,
colour = "Euphotic (m)"
),
alpha = .2,
size = .5
) +
scale_colour_manual(
values = c("Thermocline (m)" = 'red',
"Euphotic (m)" = "black"),
name = "Zone"
) +
# second colour scale: per-seal maximum-depth points
new_scale_color() +
geom_point(
data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = -value,
col = .id
),
alpha = 1 / 10,
size = .5,
show.legend = FALSE
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = "Maximum Depth (m)") +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position="bottom")
)
cat("<blockquote> Considering `ind_2018074` has slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went. </blockquote>")
} else {
# generic case: raw values of variable `i` against date, per seal
print(
ggplot(
data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(
x = as.Date(date),
y = value,
col = .id
)
) +
geom_point(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = i) +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
# blank lines terminate the markdown section between iterations
cat(' \n \n')
}
Considering ind_2018074 has slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went.
A few questions that I should look into:
- is the bimodal distribution of dduration and desctime due to nycthemeral migration?
- is the bimodal distribution of descrate (especially for ind_2018070 and ind_2018072) due to drift dives?
- could lightatbott be used to identify bioluminescence, since it seems there is a lot going on at the bottom?
- are the variations observed for lightatsurf due to the moon cycle?
- not sure why there is a bimodal distribution of tempatbott!
- driftrate: that one is awesome, since we can clearly see a pattern of how drift rate (and so buoyancy) changes over time, but it is noisy due to negative and positive values occurring even at the beginning of the trip. I guess you measure drift rate also in ascent phases? That figure illustrates something we also found on Southern elephant seals: drift rate varies at the descent, but rarely at the ascent. If I remember well, to maintain a constant drift rate in the ascent phase during the whole trip, Southern elephant seals adjust their pitch (diving angle).
- the bimodal distribution of verticalspeed90 and verticalspeed95 should be due to drift dives.
# Same per-variable display as the loop above, but restricted to the first
# month of the trip (day_departure < 32) and drawn as one boxplot per day
# (group = day_departure) instead of individual points.
for (i in names_display) {
# unnumbered ({-}) markdown header for the current variable
cat('####', i, '{-} \n')
if (i == "maxdepth") {
print(
ggplot() +
# thermocline depth overlay (negative = depth below surface)
geom_point(
data = data_2018_filter[day_departure < 32, .(.id,
day_departure,
thermoclinedepth)],
aes(
x = day_departure,
y = -thermoclinedepth,
colour = "Thermocline (m)",
group = day_departure
),
alpha = .2,
size = .5
) +
# euphotic-zone depth overlay
geom_point(
data = data_2018_filter[day_departure < 32, .(.id,
day_departure,
euphoticdepth)],
aes(
x = day_departure,
y = -euphoticdepth,
colour = "Euphotic (m)",
group = day_departure
),
alpha = .2,
size = .5
) +
scale_colour_manual(
values = c("Thermocline (m)" = 'red',
"Euphotic (m)" = "black"),
name = "Zone"
) +
# second colour scale: per-seal daily boxplots (ggnewscale)
new_scale_color() +
geom_boxplot(
data = melt(data_2018_filter[day_departure < 32, .(.id, day_departure, get(i))], id.vars = c(".id", "day_departure")),
aes(
x = day_departure,
y = -value,
col = .id,
group = day_departure
),
alpha = 1 / 10,
size = .5,
show.legend = FALSE
) +
facet_wrap(. ~ .id, scales = "free") +
labs(x = "# days since departure", y = "Maximum Depth (m)") +
theme_jjo() +
theme(legend.position="bottom")
)
} else {
# generic case: daily boxplots of variable `i` over the first month
print(
ggplot(
data = melt(data_2018_filter[day_departure < 32, .(.id, day_departure, get(i))], id.vars = c(".id", "day_departure")),
aes(
x = day_departure,
y = value,
color = .id,
group = day_departure
)
) +
geom_boxplot(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
labs(x = "# days since departure", y = i) +
theme_jjo()
)
}
# blank lines terminate the markdown section between iterations
cat(' \n \n')
}
Can we find nice correlation?
# Pairwise correlations between the displayed variables.
# `with = FALSE` (spelled out; was the reassignable `F`) makes data.table
# read `names_display` as a character vector of column names.
corr_2018 <- round(cor(data_2018_filter[, names_display, with = FALSE],
                       use = "pairwise.complete.obs"), 1)
# NA correlations (e.g. constant columns) are set to 0 so the plot is complete
corr_2018[is.na(corr_2018)] <- 0
# matrix of correlation p-values
corr_p_2018 <- cor_pmat(data_2018_filter[, names_display, with = FALSE])
# NA p-values are treated as non-significant
corr_p_2018[is.na(corr_p_2018)] <- 1
# display, hierarchically clustered; non-significant cells are crossed out
ggcorrplot(
  corr_2018,
  p.mat = corr_p_2018,
  hc.order = TRUE,
  method = "circle",
  type = "lower",
  ggtheme = theme_jjo(),
  sig.level = 0.05,
  colors = c("#00AFBB", "#E7B800", "#FC4E07")
)
Figure 2.4: Correlation matrix (crosses indicate non significant correlation)
Another way to see it:
# Flatten the correlation matrix into (row, column, cor, p) tuples.
cor_result_2018 <- flat_cor_mat(corr_2018, corr_p_2018)
# Keep only correlations of at least 0.7, strongest first.
# Fixed: the caption previously said 0.75 while the filter uses 0.7.
cor_result_2018[cor >= .7, ][order(-abs(cor))] %>%
  sable(caption = "Pairwise correlation above 0.7 and associated p-values")
| row | column | cor | p |
|---|---|---|---|
| verticalspeed90perc | verticalspeed95perc | 1.0 | 0 |
| maxdepth | asctime | 0.8 | 0 |
| botttime | efficiency | 0.8 | 0 |
| dwigglesbott | foragingindex | 0.8 | 0 |
| maxdepth | dduration | 0.7 | 0 |
| maxdepth | desctime | 0.7 | 0 |
| dduration | desctime | 0.7 | 0 |
| dduration | asctime | 0.7 | 0 |
| totvertdistbot | bottrange | 0.7 | 0 |
| totvertdistbot | verticalspeed90perc | 0.7 | 0 |
| totvertdistbot | verticalspeed95perc | 0.7 | 0 |
I guess nothing unexpected here, I’ll have to check with Patrick about the efficiency ;)
# Proportion of each dive type per individual and day since departure.
# The three columns are added by reference, one statement each.
data_2018_filter[, sum_id := .N, by = .(.id, day_departure)]
data_2018_filter[, sum_id_days := .N, by = .(.id, day_departure, divetype)]
data_2018_filter[, prop := sum_id_days / sum_id]
dataPlot <- unique(data_2018_filter[, .(prop, .id, divetype, day_departure)])
# stacked area plot of dive-type proportions along the trip
ggplot(dataPlot, aes(
  x = as.numeric(day_departure),
  y = prop,
  fill = as.character(divetype)
)) +
  geom_area(alpha = 0.6, size = 1) +
  facet_wrap(.id ~ ., scales = "free") +
  theme_jjo() +
  theme(legend.position = "bottom") +
  labs(x = "# of days since departure", y = "Proportion of dives", fill = "Dive types")
Figure 2.5: Proportion dive types
# Dive duration against maximum depth, coloured by individual.
ggplot(data = data_2018_filter, aes(y = dduration, x = maxdepth, col = .id)) +
  geom_point(size = .5, alpha = .1, show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  labs(x = "Maximum depth (m)", y = "Dive duration (s)") +
  theme_jjo()
Figure 2.6: Dive duration vs. Maximum Depth colored 2018-individuals
# Dive duration against maximum depth, coloured by dive type; the legend
# points are enlarged and made opaque so the colours stay readable.
ggplot(data = data_2018_filter, aes(y = dduration, x = maxdepth, col = divetype)) +
  geom_point(size = .5, alpha = .1) +
  facet_wrap(.id ~ .) +
  guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))) +
  labs(x = "Maximum depth (m)", y = "Dive duration (s)") +
  theme_jjo() +
  theme(legend.position = "bottom")
Figure 2.7: Dive duration vs. Maximum Depth colored by Dive Type
# Percentage of the track completed, per individual (added by reference),
# then dive duration against maximum depth coloured by trip progress.
data_2018_filter[, prop_track := (day_departure * 100) / max(day_departure), by = .id]
ggplot(data = data_2018_filter, aes(y = dduration, x = maxdepth, col = prop_track)) +
  geom_point(size = .5, alpha = .1) +
  facet_wrap(.id ~ .) +
  labs(x = "Maximum depth (m)", y = "Dive duration (s)", col = "Proportion of completed track (%)") +
  scale_color_continuous(type = "viridis") +
  theme_jjo() +
  theme(legend.position = "bottom")
Figure 2.8: Dive duration vs. Maximum Depth colored by # days since departure
There seems to be a patch for high depths (especially visible for ind_2018070), but I don’t know what it could be linked to…
Not sure about these graphs, especially if driftrate is calculated during both ascent and descent phases.
# Daily medians per individual, then drift rate against bottom time.
# Fixed: `guides(color = FALSE)` is deprecated (ggplot2 >= 3.3.4) -> "none".
ggplot(data_2018_filter[, .(driftrate = median(driftrate, na.rm = TRUE),
                            botttime = median(botttime, na.rm = TRUE),
                            maxdepth = median(maxdepth, na.rm = TRUE),
                            dduration = median(dduration, na.rm = TRUE)),
                        by = .(.id, day_departure)],
       aes(x = botttime, y = driftrate, col = .id)) +
  geom_point(size = .5, alpha = .5) +
  geom_smooth(method = "lm") +
  guides(color = "none") +
  facet_wrap(.id ~ .) +
  # NOTE(review): `limits` drops (not just crops) points beyond 700 s
  scale_x_continuous(limits = c(0, 700)) +
  labs(x = "Daily median Bottom time (s)", y = "Daily median drift rate (m.s-1)") +
  theme_jjo()
Figure 2.9: Drift rate vs. Bottom time
# Daily medians per individual, then drift rate against maximum depth.
# Fixed: `guides(color = FALSE)` is deprecated (ggplot2 >= 3.3.4) -> "none".
ggplot(data_2018_filter[, .(driftrate = median(driftrate, na.rm = TRUE),
                            botttime = median(botttime, na.rm = TRUE),
                            maxdepth = median(maxdepth, na.rm = TRUE),
                            dduration = median(dduration, na.rm = TRUE)),
                        by = .(.id, day_departure)],
       aes(x = maxdepth, y = driftrate, col = .id)) +
  geom_point(size = .5, alpha = .5) +
  geom_smooth(method = "lm") +
  guides(color = "none") +
  facet_wrap(.id ~ .) +
  labs(x = "Daily median Maximum depth (m)", y = "Daily median drift rate (m.s-1)") +
  theme_jjo()
Figure 2.10: Drift rate vs. Maximum depth
# Daily medians per individual, then drift rate against dive duration.
# Fixed: `guides(color = FALSE)` is deprecated (ggplot2 >= 3.3.4) -> "none".
ggplot(data_2018_filter[, .(driftrate = median(driftrate, na.rm = TRUE),
                            botttime = median(botttime, na.rm = TRUE),
                            maxdepth = median(maxdepth, na.rm = TRUE),
                            dduration = median(dduration, na.rm = TRUE)),
                        by = .(.id, day_departure)],
       aes(x = dduration, y = driftrate, col = .id)) +
  geom_point(size = .5, alpha = .5) +
  geom_smooth(method = "lm") +
  guides(color = "none") +
  facet_wrap(.id ~ .) +
  labs(x = "Daily median Dive duration (s)", y = "Daily median drift rate (m.s-1)") +
  theme_jjo()
Figure 2.11: Drift rate vs. Dive duration